library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(readr)
library(networkD3)
library(visNetwork)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(sjPlot)
## Registered S3 method overwritten by 'parameters':
##   method                         from      
##   format.parameters_distribution datawizard
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(wordcloud) 
## Loading required package: RColorBrewer
library(tidytext) 
library(DT) 
## 
## Attaching package: 'DT'
## The following object is masked from 'package:networkD3':
## 
##     JS
library(leaflet) 
## 
## Attaching package: 'leaflet'
## 
## The following object is masked from 'package:networkD3':
## 
##     JS
library(readr)
library(ggthemes)
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(stringr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(RColorBrewer)
library("data.table")
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
## 
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
## 
##     wind
register_google(key = "AIzaSyBlZS1dMHmz5qAZww5VQRBBSlJa3VFfyVg", write = TRUE)
## Replacing old key (AIzaSyBlZS1dMHmz5qAZww5VQRBBSlJa3VFfyVg) with new key in C:/Users/DELL/Documents/.Renviron

Furthermore, it is obvious that the locaion doesn’t separate the bars in different groups. Perhaps Vegas is not a big place, so the distance doesn’t matter.

## 8. Append: Network of bars
rb <- barreview %>%
  group_by(business_id) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head(50)
r50 <- barreview %>%
   filter(business_id %in% rb$business_id) %>%
   select(user_id, business_id)

## One bar have a branch in very close location also listed in top50, so actually the number of bars is 49

r50 <- left_join(r50, bar, by = "business_id")
r50 <- r50[, c("user_id", "name", "neighborhood", "review_count")]
r50$name<-gsub('["]',"", r50$name)
r50$name <- ifelse(r50$name == "Bachi Burger" & r50$neighborhood == "Southeast",  "Bachi Burger(SE)", r50$name)
r50 <- mutate(r50, barnum = as.factor(name))
barlink <- NULL
 for (i in 1 : 48) {
   for (j in i : 48) {
      a <- r50 %>% filter(as.numeric(barnum) == i)
      a <- a$user_id
      b <- r50 %>% filter(as.numeric(barnum) == j)
      b <- b$user_id
      c <- intersect(a, b)
      d <- length(c)
      tmp <- c(i, j, d)
      barlink <- rbind(barlink, tmp)
   }
}
barlink <- as.data.frame(barlink)
barlink <- barlink %>% 
  filter(V1 != V2) %>% 
  filter(V3 > 0)
barlink <- rename(barlink, source = V1, target = V2, value = V3)
barlink2 <- barlink %>% filter(value >= 50)
barlink2$value <- barlink2$value / 50
barlink2$source = barlink2$source - 1
barlink2$target = barlink2$target - 1  
barnode <- r50[, c("barnum", "neighborhood", "review_count")] %>% 
     distinct()
barnode$neighborhood <- as.factor(barnode$neighborhood)
barnode$rcsize <- barnode$review_count / 100 - 10
p8 <- forceNetwork(Links = barlink2, 
             Nodes = barnode, 
             Source = "source",
             Target = "target", 
             Value = "value", 
             NodeID = "barnum",
             Nodesize = "rcsize",
             Group = "neighborhood", 
             opacity = 0.6, zoom = TRUE)
p8

This picture tells you the distribution of bars in Vegas’ neighborhood. The strip is leading, and Eastside and Downtown are also great. However, if we take the average review scores into account, maybe Downtown is the best area for entertainment.

neighborhood <- bar %>%
  group_by(neighborhood) %>%
  summarize(avgstar = mean(stars), count = n())
neighborhood[1, 1] <- "Not known"
neighborhoodgraph <- ggplot(neighborhood, aes(x = count, y = reorder(neighborhood, count), fill = round(avgstar, 2))) + 
  geom_bar(stat="identity", width=1, color="white") +
  labs(x="review count", y="name of neighborhood")

## 3. Which neighborhood has most bars?
p3 <- ggplotly(neighborhoodgraph)
p3

This table tells you if we define the bars who have in average more than 4 stars and greater than 100 reviews are the top bars, how many of all the bars in Vegas are the top bars. The result is that only 240 bars are top, and the rest 1139 are normal. The gap of average review counts in the two types of bars is huge. Even in Vegas, good bars are rare. But their popularity and quality are undoubtful.

topstar <- mutate(bar, topstar = ifelse(stars >= 4 & review_count >= 100, "Top bars", "Non top bars"))
toppct <- topstar %>%
  group_by(topstar) %>%
  summarize(count = n(), avgreview = mean(review_count))

## 4. The gap between topbars and non top bars
p4 <- tab_df(toppct)
p4
topstar count avgreview
Non top bars 1139 106.49
Top bars 240 529.58

In the distribution of top bars, we find Spring Valley and China town are great areas. They are easy to be ignored if we just concentrate to the location crowded with bars.

topstar <- topstar %>%
  filter(topstar == "Top bars") %>%
  group_by(neighborhood) %>%
  summarize(count = n(), avgstar = mean(stars))
topstar[1, 1] <- "Not known"
topbargraph <- ggplot(topstar, aes(x = count, y = reorder(neighborhood, count), fill = round(avgstar, 2))) + 
  geom_bar(stat="identity", width=1, color="white") +
  labs(x="review count", y="name of neighborhood")

## 5. Which neighborhood has most top bars?
p5 <- ggplotly(topbargraph)
p5

Then let us dig deeper to discover the relationship among the attributes of elite users who has reviewed in Vegas’ bars. It seems that you will be more picky when you experience more places. But you may also be more conservative to criticize when your number of fans increases. The first linear regression graph shows this trend.

## 6. What influences the elite users' review scores most who has reviewed in Las Vegas.
lm1 <- lm(average_stars ~ review_count + fans + useful + compliment_hot, elite)
summary(lm1)
## 
## Call:
## lm(formula = average_stars ~ review_count + fans + useful + compliment_hot, 
##     data = elite)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87542 -0.16202  0.01053  0.17069  0.75058 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.896e+00  1.803e-02 216.130  < 2e-16 ***
## review_count   -8.430e-05  1.598e-05  -5.274 1.77e-07 ***
## fans            1.525e-04  4.488e-05   3.397 0.000719 ***
## useful          1.548e-07  6.045e-07   0.256 0.798000    
## compliment_hot -2.032e-06  5.988e-06  -0.339 0.734405    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2631 on 718 degrees of freedom
## Multiple R-squared:  0.04107,    Adjusted R-squared:  0.03573 
## F-statistic: 7.689 on 4 and 718 DF,  p-value: 4.549e-06
userstars <- ggplot(data = elite, aes(x = fans, y = average_stars)) +
  stat_smooth(method = "lm", col = "blue") + 
  xlab("Number of fans") + ylab("Average stars")
p6 <- ggplotly(userstars)
## `geom_smooth()` using formula 'y ~ x'
p6

Map Visualization part

get Las Vegas Map

map_lv <- get_map("Las Vegas",
                            zoom = 12,
                            source = "stamen",
                            maptype = "toner-background")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=Las%20Vegas&zoom=12&size=640x640&scale=2&maptype=terrain&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Las+Vegas&key=xxx
## Source : http://tile.stamen.com/toner-background/12/736/1604.png
## Source : http://tile.stamen.com/toner-background/12/737/1604.png
## Source : http://tile.stamen.com/toner-background/12/738/1604.png
## Source : http://tile.stamen.com/toner-background/12/739/1604.png
## Source : http://tile.stamen.com/toner-background/12/736/1605.png
## Source : http://tile.stamen.com/toner-background/12/737/1605.png
## Source : http://tile.stamen.com/toner-background/12/738/1605.png
## Source : http://tile.stamen.com/toner-background/12/739/1605.png
## Source : http://tile.stamen.com/toner-background/12/736/1606.png
## Source : http://tile.stamen.com/toner-background/12/737/1606.png
## Source : http://tile.stamen.com/toner-background/12/738/1606.png
## Source : http://tile.stamen.com/toner-background/12/739/1606.png
## Source : http://tile.stamen.com/toner-background/12/736/1607.png
## Source : http://tile.stamen.com/toner-background/12/737/1607.png
## Source : http://tile.stamen.com/toner-background/12/738/1607.png
## Source : http://tile.stamen.com/toner-background/12/739/1607.png
ggmap(map_lv)

map bar location

g_location <- ggmap(map_lv) + theme_map()
g_location + geom_point(data=bar, aes(x=longitude,y=latitude),
                    size=0.3, alpha=0.3, color="blue")
## Warning: Removed 347 rows containing missing values (geom_point).

highlight the bar hot spot

g_density <- g_location + geom_density2d(aes(x=longitude,y=latitude), 
  data=bar, color="green", size=1, bins=12) +
  stat_density2d(aes(x=longitude,y=latitude,
    fill = ..level.., alpha = ..level..),
    data=bar, geom = 'polygon', bins=12) +
  scale_fill_gradient2(low = "green", mid="yellow", high = "red") +
  scale_alpha(range = c(0.00, 0.5)) 
g_density
## Warning: Removed 347 rows containing non-finite values (stat_density2d).
## Removed 347 rows containing non-finite values (stat_density2d).

# Visualize the neighborhood each bar belongs to

#add legend of stars
lvbar_map_neighborhood <- 
  leaflet(bar) %>%
  addTiles() %>%    # Add OpenStreetMap map tiles
  addCircles(lng = ~longitude, lat = ~latitude)

pal = colorFactor("Set1", domain = bar$neighborhood) # Grab a palette
color_neighborhood = pal(bar$neighborhood)
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
lvbar_map_neighborhood %>% addCircles(color=color_neighborhood) %>%
  addLegend(pal = pal, values = ~bar$neighborhood, title = "Neighborhood")
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
content <- paste("Name:",bar$name,"<br/>",
                 "Address:",bar$address,"<br/>",
                 "Stars:",bar$stars,"<br/>",
                 "Neighborhood:", bar$neighborhood,"<br/>")

pal = colorFactor("YlOrRd", domain = bar$stars) # Grab a palette
color_stars = pal(bar$stars)

lvbar_map_neighborhood %>% addCircles(color=color_stars, popup = content) %>%
  addLegend(pal = pal, values = ~bar$stars, title = "Stars")
## Assuming "longitude" and "latitude" are longitude and latitude, respectively

Text Analysis Part

Most Popular Categories regarding bars:

fillColor = "#FFA07A"
fillColor2 = "#F1C40F"

categories = str_split(bar$categories,";")
categories = as.data.frame(unlist(categories))
colnames(categories) = c("Name")

categories %>%
  group_by(Name) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() %>%
  mutate(Name = reorder(Name,Count)) %>%
  head(10) %>%
  
  
  ggplot(aes(x = Name,y = Count)) +
  geom_bar(stat='identity',colour="white", fill =fillColor2) +
  geom_text(aes(x = Name, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black',
            fontface = 'bold') +
  labs(x = 'Name of Category', y = 'Count', 
       title = 'Top 10 Categories regarding bars') +
  coord_flip() + 
  theme_bw()

Bars with most number of five Star Reviews:

stars_5 <- barreview %>%
  filter(stars ==5) %>%
  group_by(business_id) %>%
  select(business_id,stars,text) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() 


five = merge(stars_5, bar, by= "business_id")
five2 <- five %>%
  filter(stars ==5) %>%
  filter(is_open==1)

fivestar <- five2 %>%
  arrange(stars) %>%
  head(10)
fillColor2 = "#F1C40F"

fivestar %>%
  mutate(name = reorder(name,Count)) %>%
  ggplot(aes(x = name,y = Count)) +
  geom_bar(stat='identity',colour="white", fill = fillColor2) +
  geom_text(aes(x = name, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 2, colour = 'black',
            fontface = 'bold') +
  labs(x = 'Name of the Bars', 
       y = 'Count', 
       title = 'Name of the bars and Count') +
  coord_flip() +
  theme_bw()

Most 5 starred bar - J Karaoke Bar:

J_karaoke = bar %>% filter(business_id == "3pSUr_cdrphurO6m1HMP9A") %>%
  select(name,neighborhood,city,state,postal_code,categories)

datatable(head(J_karaoke), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))

A wordcloud to see the common words of reviews on “J Karaoke Bar”

createWordCloud = function(train)
{
  train %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word) %>%
  count(word,sort = TRUE) %>%
  ungroup()  %>%
  head(30) %>%
  
  with(wordcloud(word, n, max.words = 30,colors=brewer.pal(8, "Dark2")))
}

createWordCloud(barreview %>%
  filter(business_id == "3pSUr_cdrphurO6m1HMP9A"))

From the wordcloud, we can derive the ingisht that people praise the atmosphere, music, cleaning environment, services and food(especially chicken) in this bar, and indicates that they spend happy and comfortable time in this J Karaoke bar.

Similarly, let’s visualize the bars with most number of one star reviews:

#library()
stars_1 <- barreview %>%
  filter(stars ==1|stars==1.5) %>%
  group_by(business_id) %>%
  select(business_id,stars,text) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() 


one = merge(stars_1, bar, by= "business_id")
one2 <- one %>%
  filter(stars ==1|stars==1.5) %>%
  filter(is_open==1)

onestar <- one2 %>%
  arrange(stars) %>%
  head(10)

fillColor2 = "#F1C40F"

onestar %>%
  mutate(name = reorder(name,Count)) %>%
  ggplot(aes(x = name,y = Count)) +
  geom_bar(stat='identity',colour="white", fill = fillColor2) +
  geom_text(aes(x = name, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 2, colour = 'black',
            fontface = 'bold') +
  labs(x = 'Name of the Bars', 
       y = 'Count', 
       title = 'Name of the bars and Count') +
  coord_flip() +
  theme_bw()

Surprisingly, the bar named “Triumph property management” only has one star rating, and there are 12 reviews on that bar.

So we are interested to see the common words of reviews on “Triumph property management”:

createWordCloud = function(train)
{
  train %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word) %>%
  count(word,sort = TRUE) %>%
  ungroup()  %>%
  head(30) %>%
  
  with(wordcloud(word, n, max.words = 30,colors=brewer.pal(8, "Dark2")))
}

createWordCloud(barreview %>%
  filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ"))

People in their reviews complain about the house/environment of the bar.

Let’s create a datatable to see some information regarding “Triumph property management”:

Triumph = bar %>% filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ") %>%
  select(name,neighborhood,city,state,postal_code,categories)

datatable(head(Triumph), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))

define 5-star and 1-star bar datasets with reviews

goodbar <- barreview %>%
  filter(stars == 5) %>%
  group_by(business_id) %>%
  ungroup()

star_five <- merge(goodbar,bar,by = "business_id")

badbar <- barreview %>%
  filter(stars == 1) %>%
  group_by(business_id) %>%
  ungroup()

star_one <- merge(badbar,bar,by = "business_id")

preprocessing reviews

df_five = data.frame(doc_id = star_five$business_id, text = star_five$text,stringsAsFactors = F)
star_five2 <- DataframeSource(df_five)
star_five2 <- VCorpus(star_five2)

df_one = data.frame(doc_id = star_one$business_id, text = star_one$text,stringsAsFactors = F)
star_one2 <- DataframeSource(df_one)
star_one2 <- VCorpus(star_one2)
#Remove unnecessary words(stop words), synatx, punctuation, numbers, white space etc.
remove_nonalphanum <- function(x){str_replace_all(x, "[^[:alnum:]]", " ")}
remove_brandnames <- function(x){str_replace_all(x, "\\b[A-Z]+\\b", " ")}

clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, content_transformer(remove_nonalphanum))
  corpus <- tm_map(corpus, content_transformer(remove_brandnames))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, c(stopwords("en")))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}
#cleaning two datasets
star_five_clean <- clean_corpus(star_five2)
star_one_clean <- clean_corpus(star_one2)
#create a document-term-matrix:
#create the dtm from the corpus
corpus_five_dtm <- DocumentTermMatrix(star_five_clean)
corpus_one_dtm <- DocumentTermMatrix(star_one_clean)
#provide a word cloud of the most frequent words for "five_star" bars and "one_star" bars
corpus_five_dt <- tidy(corpus_five_dtm)
corpus_one_dt <- tidy(corpus_one_dtm) 
head(corpus_five_dt)
## # A tibble: 6 x 3
##   document               term     count
##   <chr>                  <chr>    <dbl>
## 1 --q7kSBRb0vWC8lSkXFByA bit          1
## 2 --q7kSBRb0vWC8lSkXFByA blvd         1
## 3 --q7kSBRb0vWC8lSkXFByA drive        1
## 4 --q7kSBRb0vWC8lSkXFByA football     1
## 5 --q7kSBRb0vWC8lSkXFByA games        1
## 6 --q7kSBRb0vWC8lSkXFByA great        3
#tf-idf
corpus_five_tdidf <- corpus_five_dt %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
## Warning: A value for tf_idf is negative:
##  Input should have exactly one row per document-term combination.
corpus_one_tdidf <- corpus_one_dt %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
## Warning: A value for tf_idf is negative:
##  Input should have exactly one row per document-term combination.
head(corpus_five_tdidf)
## # A tibble: 6 x 6
##   document               term                count     tf   idf tf_idf
##   <chr>                  <chr>               <dbl>  <dbl> <dbl>  <dbl>
## 1 41n0hPBRTHmZLKXAhGx9vA townaaaa                1 0.25    7.19  1.80 
## 2 JZG8d5KfxedhYEDJUXGAxw abdul                   1 0.111   7.19  0.799
## 3 0JmCGyAqAK_yVEuZ0ts16w bilbos                  1 0.1     7.19  0.719
## 4 s9RzET4jnh3pFhAYEqRgNA seriouslybartenders     1 0.0909  7.19  0.653
## 5 dd3lBUGkokm3P1ChWJnYuw freds                   1 0.1     5.80  0.580
## 6 WM5lY9Yp9FFr7dsC-5sNYA jillians                2 0.0833  6.50  0.541
term_frequency_DT_five <- corpus_five_tdidf
term_frequency_DT_one <- corpus_one_tdidf

library(wordcloud)
#Set seed
set.seed(2103)

#create a wordcloud to show the frequent words of five stars bars
wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf,
          max.words=50, colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## hour could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## happy could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## sportsbook could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## guitar could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## best could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf, :
## townaaaa could not be fit on page. It will not be plotted.

#create a wordcloud to show the frequent words of one stars bars
wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf,
          max.words=50, colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## behind could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## march could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## plain could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## longer could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## open could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## service could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## sadden could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## either could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## wake could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## management could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, : cab
## could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## place could not be fit on page. It will not be plotted.
## Warning in wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf, :
## closed could not be fit on page. It will not be plotted.

A pyramid plot to show how the words between five-stars and one-stars bars differ in word frequency:

#combine corpus of the most successful and unsuccessful projects
#select top 20 words
corpus_five_dt$bestworst <- "Top"
corpus_one_dt$bestworst <- "Bottom"
corpus_top_bottom_dt <- rbind(corpus_five_dt,corpus_one_dt)


corpus_top_bottom_count <- corpus_top_bottom_dt %>%
  group_by(term) %>%
  summarize(total_word = sum(count)) %>%
  arrange(desc(total_word)) %>%
  head(20)
pyramid = left_join(corpus_top_bottom_dt, corpus_top_bottom_count, by='term')

pyramid <- pyramid %>%
  filter(!is.na(total_word)) %>%
  group_by(bestworst) %>%
  mutate(count_plot = ifelse(bestworst == 'Bottom', count*(-1), count))
ggplot(pyramid, aes(x = reorder(term, total_word),
                  y = count_plot, fill = bestworst)) +
  geom_bar(data = filter(pyramid, bestworst == "Top"), stat = "identity") +
  geom_bar(data = filter(pyramid, bestworst == "Bottom"), stat = "identity") +
  scale_fill_brewer(palette = "Set1", direction=-1) +
  coord_flip() +
  scale_y_continuous(breaks = seq(-50,50,25)) +
  scale_fill_discrete(name = 'bars star rating', labels=c('one star', 'five star')) +
  ylab("") +
  ggthemes::theme_tufte() + 
  labs(
    x = 'Top 20 Words',
    y= 'Count',
    title = 'Pyramid Plot of Top 20 Words, for one star bars and five star bars'
  )
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

Sentiment analysis of reviews:

Positive v.s. negative words in the reivews of J Karaoke Bar

positiveWordsBarGraph <- function(SC) {
  contributions <- SC %>%
    unnest_tokens(word, text) %>%
    count(word,sort = TRUE) %>%
    ungroup() %>%
    
    inner_join(get_sentiments("afinn"), by = "word") %>%
    group_by(word) %>%
    summarize(occurences = n(),
              contribution = sum(value))

  
  contributions %>%
    top_n(20, abs(contribution)) %>%
    mutate(word = reorder(word, contribution)) %>%
    head(20) %>%
    ggplot(aes(word, contribution, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    coord_flip() + theme_bw()
}

positiveWordsBarGraph(barreview %>%
                       filter(business_id == "3pSUr_cdrphurO6m1HMP9A"))

calculate sentiment for “J Karaoke Bar”

J_Karaoke_reviews = star_five %>%
  filter(business_id == "3pSUr_cdrphurO6m1HMP9A")

calculate_sentiment <- function(review)
{
  sentiment_lines  =  review %>%
                  unnest_tokens(word, text) %>%
                  inner_join(get_sentiments("afinn"), by = "word") %>%
                  group_by(user_id) %>%
                  summarize(sentiment = mean(value),words = n()) %>%
                  ungroup() %>%
                  filter(words >= 10) 

  return(sentiment_lines)
  
}


sentiment_lines = calculate_sentiment(J_Karaoke_reviews)

head(sentiment_lines)
## # A tibble: 6 x 3
##   user_id                sentiment words
##   <chr>                      <dbl> <int>
## 1 2wxtnu-tA8i9HjHD55iU6g     1.91     11
## 2 3iocNPlPThAG2ZaNtUo4TQ     0.733    30
## 3 6vmfgZN3IC3toXRQdmKtBg     1.19     16
## 4 8dxkcmAXY4ttrVFD1GhbdQ     1.67     21
## 5 aF0BTeTVRXv4OHYXMNH7SQ     2.12     17
## 6 AVkp6nU-94QuRYZDxO6kpQ     1.54     13

Display top 10 most positive reviews for 5 star bars:

display_pos_sentiments <- function(sentiment_lines,review_text)
{
  pos_sentiment_lines = sentiment_lines %>%
  arrange(desc(sentiment))  %>%
  top_n(10, sentiment) %>%
  inner_join(review_text, by = "user_id") %>%
  select(date,sentiment,text) 
  
datatable(pos_sentiment_lines, style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))

}

display_pos_sentiments(sentiment_lines,J_Karaoke_reviews)

Positive v.s. negative words in the reivews of Triumph property management

positiveWordsBarGraph <- function(SC) {
  contributions <- SC %>%
    unnest_tokens(word, text) %>%
    count(word,sort = TRUE) %>%
    ungroup() %>%
    
    inner_join(get_sentiments("afinn"), by = "word") %>%
    group_by(word) %>%
    summarize(occurences = n(),
              contribution = sum(value))

  
  contributions %>%
    top_n(20, abs(contribution)) %>%
    mutate(word = reorder(word, contribution)) %>%
    head(20) %>%
    ggplot(aes(word, contribution, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    coord_flip() + theme_bw()
}

positiveWordsBarGraph(barreview %>%
                       filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ"))

Triumph_reviews = barreview %>%
  filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ")

calculate_sentiment <- function(review)
{
  sentiment_lines  =  review %>%
                  unnest_tokens(word, text) %>%
                  inner_join(get_sentiments("afinn"), by = "word") %>%
                  group_by(user_id) %>%
                  summarize(sentiment = mean(value),words = n()) %>%
                  ungroup() %>%
                  filter(words >= 10) 

  return(sentiment_lines)
  
}


sentiment_lines = calculate_sentiment(Triumph_reviews)

Display top 10 most negative reviews for Triumph property management:

display_neg_sentiments <- function(sentiment_lines,review_text)
{
  neg_sentiment_lines = sentiment_lines %>%
  arrange(desc(sentiment))  %>%
  top_n(-10, sentiment) %>%
  inner_join(review_text, by = "user_id") %>%
  select(date,sentiment,text) 
  
datatable(neg_sentiment_lines, style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
}

display_neg_sentiments(sentiment_lines,Triumph_reviews)

Visualize the geographical location of the top 10 five star bars(blue dot) and bottom 1 star or 1.5 star bars(yellow dot):

library(devtools)
## Loading required package: usethis
devtools::install_github("rstudio/leaflet")
## Downloading GitHub repo rstudio/leaflet@HEAD
## cli          (3.2.0 -> 3.3.0) [CRAN]
## vctrs        (0.4.0 -> 0.4.1) [CRAN]
## RColorBrewer (1.1-2 -> 1.1-3) [CRAN]
## tibble       (3.1.6 -> 3.1.7) [CRAN]
## scales       (1.1.1 -> 1.2.0) [CRAN]
## ggplot2      (3.3.5 -> 3.3.6) [CRAN]
## sp           (1.4-6 -> 1.4-7) [CRAN]
## Installing 7 packages: cli, vctrs, RColorBrewer, tibble, scales, ggplot2, sp
## Warning: packages 'RColorBrewer', 'tibble', 'ggplot2' are in use and will not be
## installed
## Installing packages into 'C:/Users/DELL/Documents/R/win-library/4.1'
## (as 'lib' is unspecified)
## package 'cli' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'cli'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\DELL\Documents\R\win-library\4.1\00LOCK\cli\libs\x64\cli.dll to C:
## \Users\DELL\Documents\R\win-library\4.1\cli\libs\x64\cli.dll: Permission denied
## Warning: restored 'cli'
## package 'vctrs' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'vctrs'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\DELL\Documents\R\win-library\4.1\00LOCK\vctrs\libs\x64\vctrs.dll to C:
## \Users\DELL\Documents\R\win-library\4.1\vctrs\libs\x64\vctrs.dll: Permission
## denied
## Warning: restored 'vctrs'
## package 'scales' successfully unpacked and MD5 sums checked
## package 'sp' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'sp'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\DELL\Documents\R\win-library\4.1\00LOCK\sp\libs\x64\sp.dll to C:
## \Users\DELL\Documents\R\win-library\4.1\sp\libs\x64\sp.dll: Permission denied
## Warning: restored 'sp'
## 
## The downloaded binary packages are in
##  C:\Users\DELL\AppData\Local\Temp\RtmpaurNEv\downloaded_packages
## * checking for file 'C:\Users\DELL\AppData\Local\Temp\RtmpaurNEv\remotes2a2449137a8\rstudio-leaflet-0016c07/DESCRIPTION' ... OK
## * preparing 'leaflet':
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## Removed empty directory 'leaflet/docs/libs'
## Removed empty directory 'leaflet/docs'
## Removed empty directory 'leaflet/man-roxygen'
## * building 'leaflet_2.1.1.tar.gz'
## 
## Warning: package 'leaflet' is in use and will not be installed
LasvegasCoords = bar %>% filter(city == "Las Vegas")
center_lon = median(LasvegasCoords$longitude,na.rm = TRUE)
center_lat = median(LasvegasCoords$latitude,na.rm = TRUE)

map <- leaflet(rbind(fivestar,onestar)) %>%
  addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(lng = ~longitude, lat = ~latitude,radius = ~sqrt(review_count))  %>%
  addCircleMarkers(data=fivestar,col="blue",group="fivestar") %>%
  addCircleMarkers(data=onestar,color='yellow',group="onestar") %>%
    #Layers control
  addLayersControl(overlayGroups = c("fivestar","onestar"),
                   options = layersControlOptions(collapsed = FALSE)
                   ) %>%
  # controls
  setView(lng=center_lon, lat=center_lat,zoom = 13)
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
map

Top five stars bars are more centralized and concentrated; while the worst one star bars are relatively more sparse in their location, and are not close to transportation hub.